Exploratory Data Analysis: Data & Analytics Job Market

Code
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"

Research Question: What does the job market look like for Business Analytics, Data Science, and ML professionals in 2024?

Narrative Flow: 1. Who’s hiring? → Top companies and industries 2. What roles exist? → Job titles within our occupation categories 3. How long do postings stay open? → Duration analysis by occupation 4. What do they want? → Skills in demand 5. What drives salary? → Key factors affecting compensation

Each insight builds toward our ML modeling decisions.

Code
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go
from collections import Counter

pd.set_option('display.max_columns', None)

# Load data
df = pd.read_csv('data/lightcast_cleaned.csv')
df['POSTED'] = pd.to_datetime(df['POSTED'])

# Exclude unclassified/unknown values for cleaner analysis
df_clean = df[~df['NAICS_2022_2_NAME'].str.contains('Unclassified', na=False)].copy()

print(f"Dataset: {len(df):,} job postings")
print(f"Date range: {df['POSTED'].min().strftime('%b %Y')} - {df['POSTED'].max().strftime('%b %Y')}")
print(f"Occupations: {df['LOT_V6_OCCUPATION_NAME'].nunique()}")
Dataset: 56,773 job postings
Date range: May 2024 - Sep 2024
Occupations: 4
Code
# Top 10 Industries (excluding unclassified)
industry_counts = df_clean['NAICS_2022_2_NAME'].value_counts().head(10)

# Top 10 Companies
company_counts = df['COMPANY_NAME'].value_counts().head(10)

# Create side-by-side subplots
fig = go.Figure()

fig.add_trace(go.Bar(
    y=industry_counts.index,
    x=industry_counts.values,
    orientation='h',
    marker_color='steelblue',
    name='Industries'
))

fig.update_layout(
    title='Top 10 Industries Hiring Data Professionals',
    xaxis_title='Number of Job Postings',
    yaxis={'categoryorder': 'total ascending'},
    template='plotly_white',
    height=450,
    showlegend=False
)
fig.write_image('figures/top_industries.png', scale=2)
fig.show()

# Print top companies
print("\nTop 10 Companies by Job Postings:")
for i, (company, count) in enumerate(company_counts.items(), 1):
    print(f"  {i}. {company}: {count:,} postings")

Top 10 Companies by Job Postings:
  1. Unclassified: 3,025 postings
  2. Deloitte: 2,352 postings
  3. Accenture: 1,318 postings
  4. PricewaterhouseCoopers: 697 postings
  5. Merit America: 444 postings
  6. Insight Global: 365 postings
  7. Cardinal Health: 346 postings
  8. Chewy: 319 postings
  9. Smx Corporation Limited: 317 postings
  10. Robert Half: 313 postings
Code
# Top job titles within each occupation
fig = go.Figure()

colors = ['#1f77b4', '#ff7f0e', '#2ca02c', '#d62728']
occupations = df['LOT_V6_OCCUPATION_NAME'].unique()

for i, occ in enumerate(occupations):
    df_occ = df[df['LOT_V6_OCCUPATION_NAME'] == occ]
    top_titles = df_occ['TITLE_NAME'].value_counts().head(5)
    
    fig.add_trace(go.Bar(
        name=occ,
        y=[f"{occ[:15]}... - {title[:25]}" for title in top_titles.index],
        x=top_titles.values,
        orientation='h',
        marker_color=colors[i]
    ))

fig.update_layout(
    title='Top 5 Job Titles per Occupation Category',
    xaxis_title='Number of Postings',
    yaxis={'categoryorder': 'total ascending'},
    template='plotly_white',
    height=600,
    showlegend=True,
    legend_title_text='Occupation',
    barmode='stack'
)
fig.write_image('figures/job_titles_by_occupation.png', scale=2)
fig.show()

# Summary counts
print("\nPostings per Occupation:")
for occ, count in df['LOT_V6_OCCUPATION_NAME'].value_counts().items():
    print(f"  • {occ}: {count:,} ({count/len(df)*100:.1f}%)")

Postings per Occupation:
  • Data / Data Mining Analyst: 26,718 (47.1%)
  • Business Intelligence Analyst: 26,266 (46.3%)
  • Business / Management Analyst: 3,657 (6.4%)
  • Market Research Analyst: 132 (0.2%)
Code
# Duration by Occupation
fig = px.box(
    df,
    x='LOT_V6_OCCUPATION_NAME',
    y='DURATION',
    color='LOT_V6_OCCUPATION_NAME',
    title='Job Posting Duration by Occupation Type',
    labels={'LOT_V6_OCCUPATION_NAME': '', 'DURATION': 'Days Posted'}
)
fig.update_layout(
    template='plotly_white',
    showlegend=False,
    height=450,
    xaxis_tickangle=-15
)
fig.write_image('figures/duration_by_occupation.png', scale=2)
fig.show()

# Summary stats
print("\nDuration Statistics by Occupation (days):")
duration_stats = df.groupby('LOT_V6_OCCUPATION_NAME')['DURATION'].agg(['median', 'mean', 'std']).round(1)
duration_stats.columns = ['Median', 'Mean', 'Std Dev']
print(duration_stats.sort_values('Median'))

# Overall
print(f"\nOverall median duration: {df['DURATION'].median():.0f} days")

Duration Statistics by Occupation (days):
                               Median  Mean  Std Dev
LOT_V6_OCCUPATION_NAME                              
Business Intelligence Analyst    30.0  34.9     24.1
Data / Data Mining Analyst       30.0  34.8     23.7
Business / Management Analyst    33.0  38.1     23.3
Market Research Analyst          34.0  37.4     23.6

Overall median duration: 30 days
Code
# Extract skills
def extract_skills(skills_series):
    all_skills = []
    for skills in skills_series.dropna():
        if isinstance(skills, str) and skills not in ['Not Listed', '']:
            all_skills.extend([s.strip() for s in skills.split(',')])
    return Counter(all_skills)

# General skills
skill_counts = extract_skills(df['SKILLS_NAME'])
top_skills = pd.DataFrame(skill_counts.most_common(15), columns=['Skill', 'Count'])
top_skills['Percentage'] = (top_skills['Count'] / len(df) * 100).round(1)

# Software skills
software_counts = extract_skills(df['SOFTWARE_SKILLS_NAME'])
top_software = pd.DataFrame(software_counts.most_common(10), columns=['Software', 'Count'])
top_software['Percentage'] = (top_software['Count'] / len(df) * 100).round(1)

# Combined visualization
fig = go.Figure()

fig.add_trace(go.Bar(
    y=top_skills['Skill'],
    x=top_skills['Percentage'],
    orientation='h',
    marker_color='#2E86AB',
    name='General Skills'
))

fig.update_layout(
    title='Top 15 Skills in Demand (% of All Job Postings)',
    xaxis_title='% of Postings Requiring This Skill',
    yaxis={'categoryorder': 'total ascending'},
    template='plotly_white',
    height=500,
    showlegend=False
)
fig.write_image('figures/top_skills.png', scale=2)
fig.show()

# Print top software
print("\nTop 10 Software/Technical Skills:")
for i, row in top_software.iterrows():
    print(f"  {i+1}. {row['Software']}: {row['Percentage']}%")

Top 10 Software/Technical Skills:
  1. SQL (Programming Language): 32.5%
  2. Microsoft Excel: 19.7%
  3. SAP Applications: 18.8%
  4. Tableau (Business Intelligence Software): 18.3%
  5. Dashboard: 18.0%
  6. Python (Programming Language): 17.9%
  7. Power BI: 16.7%
  8. Microsoft Office: 11.3%
  9. Microsoft PowerPoint: 10.7%
  10. R (Programming Language): 9.0%
Code
# Salary by Occupation AND Remote Type (multi-factor view)
df_remote = df[df['REMOTE_TYPE_NAME'] != 'Not Specified'].copy()

fig = px.box(
    df_remote,
    x='LOT_V6_OCCUPATION_NAME',
    y='SALARY',
    color='REMOTE_TYPE_NAME',
    title='Salary by Occupation and Remote Work Type',
    labels={
        'LOT_V6_OCCUPATION_NAME': '',
        'SALARY': 'Annual Salary ($)',
        'REMOTE_TYPE_NAME': 'Work Type'
    }
)
fig.update_layout(
    template='plotly_white',
    height=500,
    xaxis_tickangle=-15,
    legend_title_text='Remote Type'
)
fig.write_image('figures/salary_by_occupation_remote.png', scale=2)
fig.show()

# Summary table
print("\nMedian Salary by Occupation & Remote Type:")
pivot = df_remote.pivot_table(
    values='SALARY',
    index='LOT_V6_OCCUPATION_NAME',
    columns='REMOTE_TYPE_NAME',
    aggfunc='median'
).round(0)
print(pivot.applymap(lambda x: f"${x:,.0f}" if pd.notna(x) else "N/A"))

Median Salary by Occupation & Remote Type:
REMOTE_TYPE_NAME              Hybrid Remote Not Remote    Remote
LOT_V6_OCCUPATION_NAME                                          
Business / Management Analyst       $97,250    $97,250   $97,250
Business Intelligence Analyst      $125,900   $125,900  $125,900
Data / Data Mining Analyst          $95,300    $95,300   $95,300
Market Research Analyst             $94,500        N/A   $94,500
/tmp/ipykernel_7603/976825870.py:33: FutureWarning:

DataFrame.applymap has been deprecated. Use DataFrame.map instead.